## This file downloads and formats the data used for the voter turnout example
## and creates objects needed for creating graphs


# data -------------------
# Download the tab-separated survey file from the Harvard Dataverse.
url <- "https://dataverse.harvard.edu/api/access/datafile/4030672"
dep <- read.table(url, sep = "\t", header = TRUE)

# Remove the outcome (vote18) and the first 20 columns (component questions
# used to measure depression). drop = FALSE keeps the result a data frame even
# if a single column remains.
df <- dep[, -c(1:20, which(colnames(dep) == "vote18")), drop = FALSE]

# X starts out as the columns of df that take exactly two distinct values.
# unique() counts NA as a value, so a 0/1 column with missing entries has three
# distinct values and is not selected here.
# seq_len() + vapply() stay well-defined when df has no columns, and
# drop = FALSE keeps X a data frame when only one column qualifies.
X <- df[
  ,
  vapply(
    seq_len(ncol(df)),
    function(i) length(unique(df[[i]])) == 2,
    logical(1)
  ),
  drop = FALSE
]

# Outcome variable.
Y <- dep$vote18

# Discretize
# Every multi-level feature is expanded into indicator columns named
# <stem>_low, <stem>_lowmed, <stem>_medhigh and <stem>_high, where
# medhigh = not low and high = not lowmed.
# NOTE(review): discretize() is not defined in this file; presumably it comes
# from helper.R, which has to be loaded before this script runs -- confirm.

# Appends the four indicator columns for one feature to `data`, in the order
# low, lowmed, medhigh, high, and returns the extended data frame.
.add_level_columns <- function(data, stem, low, lowmed) {
  data[[paste0(stem, "_low")]] <- low
  data[[paste0(stem, "_lowmed")]] <- lowmed
  data[[paste0(stem, "_medhigh")]] <- as.numeric(!low)
  data[[paste0(stem, "_high")]] <- as.numeric(!lowmed)
  data
}

# Features cut with discretize() at 1/3 and 2/3.
X <- .add_level_columns(
  X, "depression",
  discretize(df$depression, 1/3),
  discretize(df$depression, 2/3)
)
X <- .add_level_columns(
  X, "age",
  discretize(df$age, 1/3),
  discretize(df$age, 2/3)
)

# Features cut at fixed thresholds on their original coding.
X <- .add_level_columns(
  X, "educ",
  as.numeric(df$educ <= 2),
  as.numeric(df$educ <= 5)
)
# Note: the source column is `income`, the indicator stem is `inc`.
X <- .add_level_columns(
  X, "inc",
  as.numeric(df$income <= 4),
  as.numeric(df$income <= 9)
)
X <- .add_level_columns(
  X, "attend",
  as.numeric(df$attend <= 2),
  as.numeric(df$attend <= 3)
)

# Political interest is split only once (low / high), so it has no
# lowmed / medhigh columns.
X[["polint_low"]] <- as.numeric(df$polint <= .5)
X[["polint_high"]] <- as.numeric(!X[["polint_low"]])

X <- .add_level_columns(
  X, "inteff1",
  as.numeric(df$inteff1 <= .25),
  as.numeric(df$inteff1 <= .5)
)
X <- .add_level_columns(
  X, "inteff2",
  as.numeric(df$inteff2 <= .25),
  as.numeric(df$inteff2 <= .5)
)

X <- .add_level_columns(
  X, "motivation",
  discretize(df$motivation, 1/3),
  discretize(df$motivation, 2/3)
)


## Objects that will be needed for making plots ------------
# NOTE(review): get_label_value() is not defined in this file; presumably it
# comes from helper.R, which has to be loaded before this script -- confirm.

# All feature names (without values) as they appear in X: the part of each
# column name before the first "_", de-duplicated in column order.
# vapply() guarantees a character vector.
feats <- unique(
  vapply(strsplit(colnames(X), "_"), function(x) x[1], character(1))
)
# Display labels for the features.
# NOTE(review): presumably flabels[i] is the label of feats[i], so the two must
# have the same length and order -- confirm against get_label_value.
flabels <- c("Female", "Hispanic", "Black", "Married", "Unemployed",
             "Depression", "Age", "Education", "Income", "Religious Attendance",
             "Political Interest", "Int. Eff. #1", "Int. Eff. #2",
             "Motivation")
# Values as they appear in X (the column-name suffixes) and their labels
vals <- c("low", "lowmed", "medhigh", "high")
vlabels <- c("low", "low or med", "med or high", "high")

# Features as they appear in X and their corresponding labels
# (see description of getLabel in helper.R; fdf corresponds to the input labels_df)
# sapply() is kept because the return type of get_label_value is not visible
# from this file.
fdf <- cbind(
  colnames(X),
  sapply(colnames(X), function(x) get_label_value(x, feats, flabels, vals, vlabels))
)

# Equivalence classes
# (see description of the simplifyCondition function in helper.R)
# Each row of oppmat is a pair of values; oppind has one entry per row.
# NOTE(review): presumably oppind[[i]] lists the features to which the pair in
# oppmat[i, ] applies -- confirm against simplifyCondition.
oppmat <- rbind(c("low", "high"),
                c("low", "medhigh"),
                c("high", "lowmed"))
# The four-level features are listed by name rather than by position in feats,
# so the selection does not shift if the number of two-valued columns in the
# downloaded data changes.
multi_level_feats <- c("depression", "age", "educ", "inc", "attend",
                       "inteff1", "inteff2", "motivation")
oppind <- list("polint",
               multi_level_feats,
               multi_level_feats)

# Feature names (without values) as they appear in X and their corresponding labels
# (see description of .get_df_chord in helper.R; fgs corresponds to the input featureGroups)
# Same vector as feats (prefix of every column name of X, de-duplicated).
fgs <- feats
featGroups <- cbind.data.frame(
  fgs,
  "lab" = sapply(fgs, function(x) get_label_value(x, fgs, flabels, NA, NA))
)